9 Differential Expression

We then compare the protein expression between strains for each pairwise comparison (28 unique pairwise combinations)

To detect differentially expressed genes, we use the limma analysis on normalized protein expression.

9.1 Volcano plots

First, we can look at the volcano plots (log-foldchange vs qvalue) for each unique pairwise comparison.

#ind_na_rows  = find_na_rows(int_norm,as.indices = T)
df_imputed = tibble( uniprot = int_norm$uniprot,
                     is_imputed = (rowSums( is.na(int_norm))>0)* 1, 
                     imputed = factor(is_imputed, levels = c(0,1), labels = c("not", "is_imputed")))
volcPlot(INPUT=int_norm_bpca, IMPUTED=df_imputed, MIN_LFC=2, MIN_PVAL=0.01, WHICH='both', TOPN = 20, plot = F, use_plotly = T)
#> Joining, by = c("ID", "pValue", "qValue", "EffectSize",
#> "comparison", "sig", "log10_qvalue", "SGD", "ORF",
#> "UNIPROT", "GENENAME", "is_imputed", "imputed")

9.2 Differentially expressed genes

# Without NA
volcano_data =  get_volcano_data(input_data=int_norm_nona, min_lfc=2, min_pval=0.01, topn = 50) %>%
                bind_rows() %>% as_tibble()
#> selecting both up and down regulated genes...
dfe_nona = subset(volcano_data,sig!='Non significant')
N_DFE_NONA = nrow(dfe_nona)
NPROT_DFE_NONA =  n_distinct(dfe_nona$ID)
R_NONA = N_DFE_NONA/NPROT_DFE_NONA %>% round(0)
# With imputed expression to replace NA
dfe = get_dfe(INPUT=int_norm_bpca, WHICH='both',MIN_LFC=2, MIN_PVAL=0.01, TOPN = 20, 
              count_up=T, count_down=T)
#> Joining, by = "ID"
#> Joining, by = "ID"

N_DFE_IMPUTE = nrow(dfe)
NPROT_DFE_IMPUTE =  n_distinct(dfe$ID)
R_IMPUTE= N_DFE_IMPUTE/NPROT_DFE_IMPUTE %>% round(0)

# get_dfe(int_norm, MIN_LFC=2, MIN_PVAL=0.01,  WHICH='both', TOPN = 20) %>% remove_rownames() %>% 
#   dplyr::left_join(sc_identifiers, by=c('ID'='UNIPROT'))

# Number of times a hit is differentially expressed
df_dfe = dfe %>% 
         left_join(janitor::tabyl(dfe,ID,sig)) %>%
         rename(uniprot=ID) %>% 
         group_by(uniprot,comparison) %>% 
          mutate( n_strains_up = sum(c_across(starts_with('up_')) !=0 ),
                n_strains_down = sum(c_across(starts_with('down_'))!=0)) %>%
        replace_na(list(n_strains_up=0,n_strains_down=0)) %>%
        left_join(evo_yeast, by=c('uniprot'='UNIPROT')) %>% 
        relocate(uniprot,GENENAME,sig,pValue,qValue,EffectSize,comparison,
                 Downregulated,Upregulated,n_strains_up,n_strains_down,orf)
#> Joining, by = "ID"
  

df_dfe_annot = df_dfe %>%
          left_join(sc_annotation_orf,by=c('uniprot'='UNIPROT')) %>%
  mutate(uniprot_link = paste0("<a href='https://www.uniprot.org/uniprot/",uniprot,"'>",uniprot,"</a>"),
         sgd_link = paste0("<a href='https://www.yeastgenome.org/locus/",SGD,"'>",SGD,"</a>"),
         regulated = Downregulated+Upregulated) %>% 
  dplyr::relocate(uniprot,uniprot_link,sgd_link,regulated,Downregulated,Upregulated, 
                  GENENAME,ORF,PNAME,'FUNCTION','BIOPROCESS_all','ORTHO','OTHER')

We get 225 genes differentially expressed when excluding genes with missing expression in any samples.

On average, each gene is detected in 3.4888889 unique pairwise strain comparison.

After imputation of missing expression with bpca (Bayesian missing value imputation), we get 230 more genes differentially expressed (n=455).

On average, each gene is detected in 3.6351648 unique pairwise strain comparison.

The following table shows the list of differentially expressed genes across all unique pairwise comparison, with annotations data and conservation/snp information.

library(kableExtra)
library(formattable)
library(DT)

  # formattable(
  #   list(
  #     `Downregulated` = color_tile("white", "red"),
  #     `Upregulated` = color_tile("white", "blue"),
  #     `regulated` = color_tile("white", "gray")
  #   )
  # ) %>% 

ft_dt = DT::datatable(df_dfe_annot,
        options = list(
            paging = TRUE, pageLength = 20,  ## paginate the output and #rows for each page
            scrollY = TRUE,   ## enable scrolling on X/Y axis
            autoWidth = TRUE, ## use smart column width handling
            server = FALSE,   ## use client-side processing
            dom = 'Bfrtip', buttons = list('csv', 'excel', list(extend = 'colvis')),
            fixedColumns = list(leftColumns = 1),
            columnDefs = list(list(width = '50px', visible=TRUE, targets = "_all"))
          ),
  extensions = c('FixedHeader','FixedColumns','Buttons'),
  selection = 'single',           ## enable selection of a single row
  filter = 'top',              ## include column filters at the bottom
  rownames = FALSE,               ## don't show row numbers/names
  width = NULL, 
  height = NULL,
  caption = NULL
) %>% 
   formatStyle(columns = 1:30, target= 'row',lineHeight='100%', `font-size` = '12px')

ft_dt

9.3 Heatmap of expression for differentially expressed genes


dat_scaled = int_scaled_strains %>% as.data.frame() %>% rownames_to_column('uniprot') 

dfe_exp = dat_scaled %>% dplyr::filter( uniprot %in% dfe$ID) %>% 
          left_join(sc_identifiers,by=c('uniprot'='UNIPROT'))  %>% 
          dplyr::filter(!duplicated(GENENAME)) %>%
          mutate(GENENAME=if_na(GENENAME,uniprot))%>%
          column_to_rownames(var = 'GENENAME') %>%
          dplyr::select(-ORF,-uniprot,-SGD)

p_dfe_exp=pheatmap::pheatmap(dfe_exp,
                             fontsize = 5,cellwidth = 5,cellheight =5,border_color = NA,treeheight_col = 10,
                             filename = here('plot','heatmap-exp.png'))
knitr::include_graphics('plot/heatmap-exp.png',auto_pdf = T)

9.4 Heatmap of expression differences


dfe_lfc = get_volcano_data(input_data=int_norm_bpca, which='both',min_lfc=2, min_pval=0.01, topn = 20) %>% 
          bind_rows %>% as_tibble() %>%
          pivot_wider(id_cols=ID, names_from = 'comparison', values_from = 'EffectSize') %>% 
          dplyr::filter(ID %in% dfe$ID) %>% 
          left_join(sc_identifiers,by=c('ID'='UNIPROT')) %>%
          dplyr::filter(!duplicated(GENENAME)) %>%
          mutate(GENENAME=if_na(GENENAME,ID))%>%
          column_to_rownames('GENENAME')

dfe_lfc_mat = dfe_lfc %>% dplyr::select(-ORF,-ID,-SGD)
# Heatmap of differentially expressed genes
pheatmap::pheatmap(dfe_lfc_mat,fontsize = 5,cutree_rows = 10,cellwidth = 5,cellheight =5,border_color = NA,
                          treeheight_col = 10, filename = here('plot','heatmap-lfc.png')  )
knitr::include_graphics('plot/heatmap-lfc.png',auto_pdf = T)

9.5 Cluster genes based on their profile of differential expression strain-pairwise

# Transpose the matrix to calculate distance between experiments row-wise
d_strain <- dfe_lfc_mat %>% t() %>% dist(.,method = "euclidean", diag = FALSE, upper = FALSE)
# Calculate the distance between proteins row-wise 
d_prot <- dfe_lfc_mat %>% dist(.,method = "euclidean", diag = FALSE, upper= FALSE)

hc_prot = hclust(d_prot,method='ward.D2')
cl_prot = hc_prot %>% cutree(k=10)
library(dendextend)
dend_prot = as.dendrogram(hc_prot)
dend_prot = rotate(dend_prot,seq_along(cl_prot))
dend_prot <- color_branches(dend_prot, k=10)
dend_prot <- color_labels(dend_prot, k=10)

#labels_colors(dend) <-rainbow_hcl(3)[sort_levels_values( as.numeric(dend_prot[,5])[order.dendrogram(dend)] )]
dend_prot <- hang.dendrogram(dend_prot,hang_height=0.1)
dend_prot <- set(dend_prot, "labels_cex", 0.5)
# And plot:
par(mar = c(2,3,0,3))
plot(dend_prot,horiz =  TRUE,  nodePar = list(cex = .007))

#legend("topleft", legend = iris_species, fill = rainbow_hcl(10))
par(mar = rep(0,4))
circlize_dendrogram(dend_prot)
#> Loading required namespace: circlize